3  Riesgo hídrico en barrios populares

Una comparación de metodologías analíticas en el Partido de La Plata

3.1 Resumen Ejecutivo

3.2 Motivos y Objetivos

3.3 Datos, Metodología y Limitaciones

3.4 Resultados

Mostrar el código
import pandas as pd
import geopandas as gpd
import requests
from io import StringIO

import boto3
import duckdb


import matplotlib.pyplot as plt

import numpy as np
import s2sphere
from botocore.config import Config
from rasterstats import zonal_stats


from shapely.geometry import box

USE_CRS = "EPSG:5349"
Mostrar el código
response = requests.get(
    "https://www.argentina.gob.ar/sites/default/files/renabap-2023-12-06.geojson"
)
renabap = gpd.read_file(StringIO(response.text))
renabap_pba = renabap[renabap["provincia"] == "Buenos Aires"]
renabap_pba = renabap_pba.to_crs(USE_CRS)


peligro_path = "/home/nissim/Documents/dev/ciut-inundaciones/data/la_plata_pelig_2023_datos_originales.geojson"
peligro = gpd.read_file(peligro_path)
peligro = peligro.to_crs(USE_CRS)

# Get the bounds of the peligro layer
peligro_bounds = peligro.total_bounds
peligro_bbox = box(*peligro_bounds)

# Filter renabap_pba to only include geometries that intersect with the peligro bounds
renabap_pba_intersect = renabap_pba[
    renabap_pba.geometry.intersects(peligro_bbox)
].copy()

# make sure all geometries are valid
renabap_pba_intersect = renabap_pba_intersect[renabap_pba_intersect.geometry.is_valid]

3.4.1 Interpolación areal

Mostrar el código
import geopandas as gpd
from tobler.area_weighted import area_interpolate

# Ensure both GeoDataFrames have the same CRS
if renabap_pba_intersect.crs != peligro.crs:
    peligro = peligro.to_crs(renabap_pba_intersect.crs)

# Get unique hazard levels
hazard_levels = peligro["PELIGROSID"].unique()

# Initialize output columns in renabap_pba_intersect
for level in hazard_levels:
    renabap_pba_intersect[f"porcion_{level}"] = 0.0

# For each hazard level, calculate the portion of each barrio that falls within it
for level in hazard_levels:
    # Filter hazard polygons for this level
    hazard_subset = peligro[peligro["PELIGROSID"] == level].copy()

    if not hazard_subset.empty:
        # Add dummy variable with value 1 for each hazard polygon
        hazard_subset["hazard_area"] = 1

        # Interpolate hazard area to barrios (this gives us the proportion)
        results = area_interpolate(
            source_df=hazard_subset,
            target_df=renabap_pba_intersect,
            extensive_variables=["hazard_area"],
        )

        # This gives us the portion of each barrio that overlaps with this hazard level
        renabap_pba_intersect[f"porcion_{level}"] = results["hazard_area"]

# Calculate families exposed to each hazard level
for level in hazard_levels:
    renabap_pba_intersect[f"familias_expuestas_{level}"] = (
        renabap_pba_intersect[f"porcion_{level}"]
        * renabap_pba_intersect["familias_aproximadas"]
    )

# Create tidy dataframe with the three required columns
renabap_tidy = renabap_pba_intersect[["nombre_barrio"]].copy()

# Calculate total familias_expuestas
renabap_tidy["fam_expuestas_areal"] = (
    renabap_pba_intersect["familias_expuestas_alta"]
    + renabap_pba_intersect["familias_expuestas_baja"]
    + renabap_pba_intersect["familias_expuestas_media"]
)

# Determine highest hazard level (peligrosidad)
def get_highest_hazard(row):
    exposures = [
        row["familias_expuestas_alta"],
        row["familias_expuestas_baja"],
        row["familias_expuestas_media"],
    ]
    hazard_levels = ["alta", "baja", "media"]
    return hazard_levels[exposures.index(max(exposures))]

renabap_tidy["peligrosidad"] = renabap_pba_intersect.apply(get_highest_hazard, axis=1)

# Round fam_expuestas_areal
renabap_tidy["fam_expuestas_areal"] = renabap_tidy["fam_expuestas_areal"].round(2)

3.4.2 Mapeo dasymetrico con datos GHSL

Mostrar el código
import rasterstats
import rioxarray
from shapely.geometry import box

# Load GHSL data with dask chunking for memory efficiency
ghsl = rioxarray.open_rasterio(
    "/home/nissim/Downloads/spatial/GHS_POP_E2025_GLOBE_R2023A_54009_100_V1_0_R14_C13/GHS_POP_E2025_GLOBE_R2023A_54009_100_V1_0_R14_C13.tif",
    chunks={"x": 1024, "y": 1024},  # Adjust chunk size based on your memory constraints
)

# Reproject to your target CRS with streaming
ghsl = ghsl.rio.reproject(dst_crs=USE_CRS)

# Clip to renabap_pba_intersect bounding box using streaming
bounding_box = box(
    *renabap_pba_intersect.total_bounds
)  # Create a box from the bounding box coordinates

ghsl_clipped = ghsl.rio.clip(
    [bounding_box],  # Use the bounding box as a geometry (wrapped in a list)
    from_disk=True,  # Process from disk to avoid loading entire dataset into memory
)


# Convert to the format expected by rasterstats
geometries = [geom for geom in renabap_pba_intersect.geometry]

# Use rasterstats for vectorized zonal statistics
stats = rasterstats.zonal_stats(
    geometries,
    ghsl_clipped.values[0],  # rasterstats expects 2D array
    affine=ghsl_clipped.rio.transform(),
    stats=["sum"],
    nodata=ghsl_clipped.rio.nodata,
)

# Extract the sum values
ghsl_totals = [stat["sum"] if stat["sum"] is not None else 0 for stat in stats]

# Add the GHSL population estimates as a new column
renabap_pba_intersect["ghsl_pop_est"] = ghsl_totals


from rasterio.features import rasterize
import numpy as np

# Get the reference raster properties from GHSL data
reference_raster = ghsl_clipped
reference_transform = reference_raster.rio.transform()
reference_crs = reference_raster.rio.crs
reference_shape = reference_raster.shape[1:]  # Get 2D shape (height, width)

# Prepare geometries and values for rasterization
geometries_ghsl = [
    (geom, value)
    for geom, value in zip(
        renabap_pba_intersect.geometry, renabap_pba_intersect["ghsl_pop_est"]
    )
]
geometries_familias = [
    (geom, value)
    for geom, value in zip(
        renabap_pba_intersect.geometry, renabap_pba_intersect["familias_aproximadas"]
    )
]

# Create GHSL population raster
ghsl_pop_raster = rasterize(
    geometries_ghsl,
    out_shape=reference_shape,
    transform=reference_transform,
    fill=0,
    dtype=np.float32,
    all_touched=False,
)

# Create familias aproximadas raster
familias_raster = rasterize(
    geometries_familias,
    out_shape=reference_shape,
    transform=reference_transform,
    fill=0,
    dtype=np.float32,
    all_touched=False,
)

# Step 1: Divide original GHSL by the barrio-level GHSL to get fractional population
# Handle division by zero and nodata values properly
mask = (ghsl_clipped.values[0] > 0) & (ghsl_pop_raster > 0.1)
ghsl_fractional = np.full_like(ghsl_clipped.values[0], -200, dtype=np.float64)
ghsl_fractional[mask] = ghsl_clipped.values[0][mask] / ghsl_pop_raster[mask]

# Step 2: Multiply fractional population by familias aproximadas to get downscaled data
mask2 = (ghsl_fractional > 0) & (familias_raster > 0)
familias_downscaled = np.full_like(ghsl_clipped.values[0], -200, dtype=np.float64)
familias_downscaled[mask2] = ghsl_fractional[mask2] * familias_raster[mask2]


# Check that the sum of downscaled familias equals the original familias aproximadas
total_original_familias = renabap_pba_intersect["familias_aproximadas"].sum()
valid_downscaled = familias_downscaled[familias_downscaled != -200]
total_downscaled_familias = np.sum(valid_downscaled)


# Step 1: Divide original GHSL by the barrio-level GHSL to get fractional population
# Use masking to avoid division on invalid cells
mask = (ghsl_clipped.values[0] != -200) & (ghsl_pop_raster > 0.1)
ghsl_fractional = np.full_like(ghsl_clipped.values[0], -200, dtype=np.float64)
ghsl_fractional[mask] = ghsl_clipped.values[0][mask] / ghsl_pop_raster[mask]

# Step 2: Multiply fractional population by familias aproximadas to get downscaled data
mask2 = (ghsl_fractional != -200) & (familias_raster > 0)
familias_downscaled = np.full_like(ghsl_clipped.values[0], -200, dtype=np.float64)
familias_downscaled[mask2] = ghsl_fractional[mask2] * familias_raster[mask2]

# Verify the results - exclude -200 from range calculations
ghsl_valid = ghsl_clipped.values[0] != -200
fractional_valid = ghsl_fractional != -200
downscaled_valid = familias_downscaled != -200

# Check that the sum of downscaled familias equals the original familias aproximadas
total_original_familias = renabap_pba_intersect["familias_aproximadas"].sum()
total_downscaled_familias = np.sum(familias_downscaled[downscaled_valid])
print(f"\nTotal original familias: {total_original_familias:,.0f}")
print(f"Total downscaled familias: {total_downscaled_familias:,.0f}")
print(f"Difference: {abs(total_original_familias - total_downscaled_familias):,.2f}")

# Intersect settlements with hazard zones
settlement_hazard = gpd.overlay(renabap_pba_intersect, peligro, how="intersection")

# Create GHSL tidy dataframe with matching structure
ghsl_tidy = []

for idx, row in settlement_hazard.iterrows():
    stats = zonal_stats(
        [row.geometry],
        familias_downscaled,  # your numpy array
        affine=reference_transform,  # get transform from your xarray
        stats=["sum"],
        nodata=-200,  # use your actual nodata value
    )[0]

    ghsl_tidy.append(
        {
            "nombre_barrio": row["nombre_barrio"],
            "peligrosidad": row["PELIGROSID"],
            "fam_expuestas_ghsl": stats["sum"] if stats["sum"] is not None else 0,
        }
    )

ghsl_tidy = pd.DataFrame(ghsl_tidy)

Total original familias: 88,856
Total downscaled familias: 88,680
Difference: 176.00

3.4.3 Estimaciones según cantidad de edificios

Mostrar el código
def fetch_buildings(geodataframe, temp_file="buildings_filtered.parquet"):
    """Fetch building data for a given GeoDataFrame region"""

    # Get S2 cell and bounds
    center = geodataframe.to_crs("epsg:3857").union_all().centroid
    center_wgs84 = (
        gpd.GeoDataFrame(geometry=[center], crs="EPSG:3857")
        .to_crs(epsg=4326)
        .geometry.iloc[0]
    )
    cell = s2sphere.CellId.from_lat_lng(
        s2sphere.LatLng.from_degrees(center_wgs84.y, center_wgs84.x)
    ).parent(10)
    bounds = geodataframe.to_crs("epsg:4326").total_bounds

    # Find matching S2 partition
    s3 = boto3.client(
        "s3",
        endpoint_url="https://data.source.coop",
        aws_access_key_id="",
        aws_secret_access_key="",
        config=Config(s3={"addressing_style": "path"}),
    )

    partitions = {
        obj["Key"].split("/")[-1].replace(".parquet", "")
        for obj in s3.list_objects_v2(
            Bucket="vida",
            Prefix="google-microsoft-open-buildings/geoparquet/by_country_s2/country_iso=ARG/",
        ).get("Contents", [])
    }

    parent_id = next(
        str(cell.parent(level).id())
        for level in range(10, 0, -1)
        if str(cell.parent(level).id()) in partitions
    )

    # Setup DuckDB and query
    con = duckdb.connect()
    for cmd in [
        "INSTALL spatial",
        "LOAD spatial",
        "INSTALL httpfs",
        "LOAD httpfs",
        "SET s3_region='us-east-1'",
        "SET s3_endpoint='data.source.coop'",
        "SET s3_use_ssl=true",
        "SET s3_url_style='path'",
    ]:
        con.execute(cmd)

    # Export and read back
    query = f"""
    COPY (SELECT * FROM 's3://vida/google-microsoft-open-buildings/geoparquet/by_country_s2/country_iso=ARG/{parent_id}.parquet'
          WHERE bbox.xmax >= {bounds[0]} AND bbox.xmin <= {bounds[2]} AND
                bbox.ymax >= {bounds[1]} AND bbox.ymin <= {bounds[3]}
    ) TO '{temp_file}' (FORMAT PARQUET);
    """

    con.execute(query)
    df = pd.read_parquet(temp_file)
    df["geometry"] = gpd.GeoSeries.from_wkb(df["geometry"])

    return gpd.GeoDataFrame(df, geometry="geometry", crs="EPSG:4326")


# Usage:
buildings = fetch_buildings(renabap_pba_intersect)

# Reproject buildings to match the analysis CRS
buildings_proj = buildings.to_crs(USE_CRS)

# Step 1: Calculate buildings per settlement-hazard intersection
buildings_hazard = gpd.overlay(buildings_proj, settlement_hazard, how="intersection")

# Count buildings per settlement-hazard combination
buildings_per_hazard = (
    buildings_hazard.groupby(["nombre_barrio", "PELIGROSID"])
    .size()
    .reset_index(name="buildings_count")
)

# Step 2: Calculate total buildings per settlement (barrio popular)
buildings_settlement = gpd.overlay(
    buildings_proj, renabap_pba_intersect, how="intersection"
)
total_buildings_per_settlement = (
    buildings_settlement.groupby("nombre_barrio")
    .size()
    .reset_index(name="total_buildings")
)

# Step 3: Merge and calculate ratios
hazard_ratios = buildings_per_hazard.merge(
    total_buildings_per_settlement, on="nombre_barrio", how="left"
)
hazard_ratios["building_ratio"] = (
    hazard_ratios["buildings_count"] / hazard_ratios["total_buildings"]
)

# Step 4: Get total population per settlement and apply ratios
settlement_population = renabap_pba_intersect[
    ["nombre_barrio", "familias_aproximadas"]
].copy()

# Merge with ratios and calculate population estimates
population_estimates = hazard_ratios.merge(
    settlement_population, on="nombre_barrio", how="left"
)
population_estimates["estimated_population_hazard"] = (
    population_estimates["building_ratio"]
    * population_estimates["familias_aproximadas"]
)

# Step 5: Create final results with totals
final_results = population_estimates[
    ["nombre_barrio", "PELIGROSID", "estimated_population_hazard"]
].copy()

# Add total population rows (no hazard breakdown)
total_pop_rows = settlement_population.copy()
total_pop_rows["PELIGROSID"] = "total"
total_pop_rows["estimated_population_hazard"] = total_pop_rows["familias_aproximadas"]

# Combine
final_results = pd.concat(
    [
        final_results,
        total_pop_rows[["nombre_barrio", "PELIGROSID", "estimated_population_hazard"]],
    ],
    ignore_index=True,
)

# Create buildings tidy dataframe with matching structure
buildings_tidy = final_results[
    ["nombre_barrio", "PELIGROSID", "estimated_population_hazard"]
].copy()

# Rename columns to match the structure
buildings_tidy = buildings_tidy.rename(
    columns={
        "PELIGROSID": "peligrosidad",
        "estimated_population_hazard": "fam_expuestas_edificios",
    }
)

# Filter out the 'total' rows since we only want hazard-specific data
buildings_tidy = buildings_tidy[buildings_tidy["peligrosidad"] != "total"].copy()

3.4.4 Comparación de resultados

Mostrar el código
# Join all three dataframes by nombre_barrio and peligrosidad
final_df = renabap_tidy.merge(
    ghsl_tidy, on=["nombre_barrio", "peligrosidad"], how="outer"
)
final_df = final_df.merge(
    buildings_tidy, on=["nombre_barrio", "peligrosidad"], how="outer"
)

# Impute 0s for NA values in fam_expuestas columns
fam_expuestas_columns = [col for col in final_df.columns if 'fam_expuestas' in col]
final_df[fam_expuestas_columns] = final_df[fam_expuestas_columns].fillna(0)

# Create long format dataframe with aggregation
final_tidy = []

# Add renabap data
for _, row in renabap_tidy.iterrows():
    final_tidy.append(
        {
            "nombre_barrio": row["nombre_barrio"],
            "peligrosidad": row["peligrosidad"],
            "metodo": "area",
            "fam_expuestas": row["fam_expuestas_areal"],
        }
    )

# Add ghsl data
for _, row in ghsl_tidy.iterrows():
    final_tidy.append(
        {
            "nombre_barrio": row["nombre_barrio"],
            "peligrosidad": row["peligrosidad"],
            "metodo": "ghsl",
            "fam_expuestas": row["fam_expuestas_ghsl"],
        }
    )

# Add buildings data
for _, row in buildings_tidy.iterrows():
    final_tidy.append(
        {
            "nombre_barrio": row["nombre_barrio"],
            "peligrosidad": row["peligrosidad"],
            "metodo": "edificios",
            "fam_expuestas": row["fam_expuestas_edificios"],
        }
    )

final_tidy = pd.DataFrame(final_tidy)

# Aggregate to get one observation per barrio per hazard level per method
final_tidy = (
    final_tidy.groupby(["nombre_barrio", "peligrosidad", "metodo"])["fam_expuestas"]
    .sum()
    .reset_index()
)

# Create complete combination of all barrios, hazard levels, and methods
all_barrios = final_tidy["nombre_barrio"].unique()
all_hazard_levels = ["alta", "baja", "media"]
all_methods = ["area", "ghsl", "edificios"]

complete_combinations = pd.DataFrame([
    {"nombre_barrio": barrio, "peligrosidad": hazard, "metodo": method}
    for barrio in all_barrios
    for hazard in all_hazard_levels
    for method in all_methods
])

# Merge with actual data and fill missing values with 0
final_tidy = complete_combinations.merge(
    final_tidy, on=["nombre_barrio", "peligrosidad", "metodo"], how="left"
)
final_tidy["fam_expuestas"] = final_tidy["fam_expuestas"].fillna(0)

# Calculate total exposure per hazard level per method
summary = (
    final_tidy.groupby(["peligrosidad", "metodo"])["fam_expuestas"]
    .sum()
    .reset_index()
    .pivot(index="peligrosidad", columns="metodo", values="fam_expuestas")
)

print("Total Familias Expuestas por Peligrosidad y Método:")
print("=" * 50)
print(summary.round(2))

import matplotlib.pyplot as plt
import seaborn as sns

# Filter for high exposure (alta peligrosidad)
alta_data = final_tidy[final_tidy["peligrosidad"] == "alta"].copy()

# Calculate total exposure per barrio across all methods
total_exposure = (
    alta_data.groupby("nombre_barrio")["fam_expuestas"]
    .sum()
    .sort_values(ascending=False)
)
top_25_barrios = total_exposure.head(25).index

# Filter data for top 25 barrios
top_25_data = alta_data[
    alta_data["nombre_barrio"].isin(top_25_barrios)
].copy()

# Create range plot showing min, max, and individual points
plt.figure(figsize=(15, 10))

# Define colors for methods
method_colors = {"area": "blue", "ghsl": "red", "edificios": "green"}

for i, barrio in enumerate(top_25_barrios):
    barrio_data = top_25_data[top_25_data["nombre_barrio"] == barrio]
    if len(barrio_data) > 0:
        values = barrio_data["fam_expuestas"].values
        min_val = values.min()
        max_val = values.max()

        # Plot range line
        plt.plot([min_val, max_val], [i, i], "k-", alpha=0.5, linewidth=2)

        # Plot individual points colored by method
        for _, row in barrio_data.iterrows():
            color = method_colors[row["metodo"]]
            plt.plot(row["fam_expuestas"], i, "o", color=color, markersize=6, alpha=0.8)

plt.yticks(range(len(top_25_barrios)), top_25_barrios)
plt.xlabel("Familias Expuestas")
plt.ylabel("Barrio")
plt.title("Range of High Exposure Estimates for Top 25 Barrios", fontsize=14)
plt.grid(True, alpha=0.3)

# Add legend
legend_elements = [
    plt.Line2D(
        [0],
        [0],
        marker="o",
        color="w",
        markerfacecolor=color,
        markersize=8,
        label=method,
    )
    for method, color in method_colors.items()
]
plt.legend(handles=legend_elements, title="Método")

plt.tight_layout()
plt.show()
Total Familias Expuestas por Peligrosidad y Método:
==================================================
metodo           area  edificios     ghsl
peligrosidad                             
alta          1909.76    3606.85  2831.97
baja          5315.18    9852.58  7726.58
media         3599.65    9717.37  8400.36

3.5 Conclusiones y Recomendaciones

3.6 Referencias